#coding=utf-8
import random
import numpy as np
from cs231n.data_utils import load_CIFAR10
import matplotlib.pyplot as plt
import time

#Configure matplotlib defaults for all figures below
plt.rcParams['figure.figsize'] = (10.0, 8.0) #default figure size in inches
plt.rcParams['image.interpolation'] = 'nearest' #interpolation used by imshow
plt.rcParams['image.cmap'] = 'gray' #default colormap
#Load the raw CIFAR-10 dataset
#NOTE(review): path is machine-specific — adjust for your environment
cifar10_dir = '/home/zxy/PycharmProjects/cs231n/assignment1/cs231n/datasets/cifar-10-batches-py'
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
#Sanity-check the shapes of the loaded arrays
print ('Training data shape: ', X_train.shape)
print('Trainging labels shape', y_train.shape)
print('Test data shape', X_test.shape)
print('Test label shape', y_test.shape)
#从数据集抓一些例子图片出来看看
#Show a few example images from each class of the training set
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
num_classes = len(classes)
sample_per_class = 7 #how many examples to display per class
for y, cls in enumerate(classes):
    idxs = np.flatnonzero(y_train == y) #indices of all training images whose label is y
    idxs = np.random.choice(idxs, sample_per_class, replace=False) #pick 7 distinct images of this class (replace=True would allow repeats)
    for i, idx in enumerate(idxs):
        plt_idx = i * num_classes + y + 1 #subplots are filled column by column: one column per class
        plt.subplot(sample_per_class, num_classes, plt_idx) #position of this image on the canvas
        plt.imshow(X_train[idx].astype('uint8')) #render the image
        plt.axis('off') #hide the axes
        if i == 0: #first row carries the class name as a title
            plt.title(cls)
plt.show()

#Carve a small subset out of the train/test sets so the experiments run quickly
num_training = 5000 #number of training images to keep
train_idx = np.arange(num_training) #indices of the first 5000 images
X_train = X_train[train_idx]
y_train = y_train[train_idx]

num_test = 500 #number of test images to keep
test_idx = np.arange(num_test) #indices of the first 500 images
X_test = X_test[test_idx]
y_test = y_test[test_idx]

#Flatten each image into a single row vector
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
print(X_train.shape, X_test.shape)

from cs231n.classifiers import k_nearest_neighbor
#Create a kNN classifier instance
#NOTE(review): "KNearestNeightbor" looks misspelled — presumably it matches the
#class name in cs231n/classifiers; confirm against that module before renaming
classifier = k_nearest_neighbor.KNearestNeightbor()
classifier.train(X_train, y_train)
#Distance from every test sample to every training sample (no-loops implementation)
dists = classifier.compute_distance_no_loops(X_test)
print(dists.shape)
#Visualize the distance matrix: each row is one test example's distances to all training examples
plt.imshow(dists, interpolation='none')
plt.show()

#Predict test labels from the precomputed distance matrix with k = 1
y_test_pred = classifier.predict_labels(dists, k=1)
#Accuracy for k = 1
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
#Fixed message typo "accurcy" -> "accuracy" (now consistent with the final report)
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

#Accuracy for k = 5
y_test_pred = classifier.predict_labels(dists, k=5)
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

#Verify that the three distance implementations produce the same matrix.
#BUG FIX: the second check previously re-ran compute_distance_no_loops, so it
#compared the no-loops result with itself; call the two-loops implementation
#instead so all three methods actually get cross-checked.
#NOTE(review): assumes the classifier defines compute_distance_two_loops,
#mirroring the one_loop/no_loops naming — confirm against cs231n/classifiers.
dists_one = classifier.compute_distance_one_loop(X_test)
difference = np.linalg.norm(dists - dists_one, ord='fro') #Frobenius norm of the difference
print('Difference is: %f' % (difference))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')


dists_two = classifier.compute_distance_two_loops(X_test)
difference = np.linalg.norm(dists - dists_two, ord='fro') #Frobenius norm of the difference
print('Difference is: %f' % (difference))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')

#Measure how long one call of a function takes
def time_function(f, *args):
    """Return the wall-clock time in seconds taken by one call of f(*args).

    BUG FIX: the original body called f(args), passing the whole argument
    tuple as a single positional argument; f(*args) unpacks it correctly.
    Uses time.perf_counter(), the recommended monotonic clock for measuring
    short durations (time.time() can jump with system clock adjustments).
    """
    tic = time.perf_counter()
    f(*args)
    toc = time.perf_counter()
    return toc - tic

#Cross-validation to choose k
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
#Split the training data into num_folds equal folds.
#BUG FIX: the labels were previously split as a 1-D array but later indexed
#with [:, 0], which selects the wrong data (np.vstack of 1-D folds yields a
#(num_folds-1, fold_len) matrix whose first COLUMN was taken as labels).
#Keep the label folds 1-D and concatenate them directly; also use num_folds
#instead of a hard-coded 5 and drop the dead reshape into X_train_.
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)

#k_to_accuracies[k] will hold num_folds accuracies, one per validation fold
k_to_accuracies = {k_: [] for k_ in k_choices}
for i in range(num_folds):
    classifier = k_nearest_neighbor.KNearestNeightbor()
    #All folds except fold i form the training set; fold i is the validation set
    X_val_train = np.vstack(X_train_folds[:i] + X_train_folds[i+1:])
    y_val_train = np.concatenate(y_train_folds[:i] + y_train_folds[i+1:])
    classifier.train(X_val_train, y_val_train)
    for k_ in k_choices:
        y_val_pred = classifier.predict(X_train_folds[i], k=k_)
        num_correct = np.sum(y_val_pred == y_train_folds[i])
        k_to_accuracies[k_].append(float(num_correct) / len(y_val_pred))

#Print every cross-validated accuracy, grouped by k
#Fixed message typo "accurcy" -> "accuracy"
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))

#Scatter the raw accuracy observations for each k
for k in k_choices:
    accuracy = k_to_accuracies[k]
    plt.scatter([k]*len(accuracy), accuracy)
#Plot the trend line with error bars of one standard deviation
accuracies_mean = np.array([np.mean(v) for k,v in sorted(k_to_accuracies.items())])
accuracies_std = np.array([np.std(v) for k,v in sorted(k_to_accuracies.items())])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std) #error-bar plot
plt.title('Cross-validation on k')
plt.xlabel('k')
#Fixed label typo "Cross-validtion" -> "Cross-validation"
plt.ylabel('Cross-validation accuracy')
plt.show()

#Retrain on the full training subset using the best k found by cross-validation
best_K = 10
#BUG FIX: "classifer" was a typo, so a fresh classifier was created but never
#bound to the name that gets trained; bind the new instance to "classifier".
classifier = k_nearest_neighbor.KNearestNeightbor()
classifier.train(X_train, y_train)
y_test_pred = classifier.predict(X_test, k=best_K)

#Compute and report the test accuracy for the best k
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))


















